doc: Add helper for converting DocBook files to Markdown

2024-11-26 08:53:21 +00:00 · 2021-07-12 22:32:21 +02:00 · 2021-07-12 22:32:21 +02:00 · da95ab11b4
commit da95ab11b4
parent 04b59b0328
5 changed files with 230 additions and 1 deletions
--- a/doc/Makefile
+++ b/doc/Makefile
@ -3,7 +3,7 @@ MD_TARGETS=$(addsuffix .xml, $(basename $(shell find . -type f -regex '.*\.md$$'
 PANDOC ?= pandoc
 pandoc_media_dir = media
-# NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh).
+# NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh) and conversion script (/maintainers/scripts/db-to-md.sh).
 # TODO: Remove raw-attribute when we can get rid of DocBook altogether.
 pandoc_commonmark_enabled_extensions = +attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
 # Not needed:
--- a/maintainers/scripts/db-to-md.sh
+++ b/maintainers/scripts/db-to-md.sh
@ -0,0 +1,88 @@
 #! /usr/bin/env nix-shell
 #! nix-shell -I nixpkgs=. -i bash -p pandoc -p libxml2
 # This script is temporarily needed while we transition the manual to
 # CommonMark. It converts DocBook files into our CommonMark flavour.
 #
 # Usage: db-to-md.sh [--debug] FILE...
 #
 # Every FILE must be a DocBook document whose root element is <chapter> or
 # <section>. The result is written next to the input, with any trailing
 # .section.xml/.chapter.xml/.xml replaced by .chapter.md or .section.md.
 # With --debug, pandoc's JSON output is written instead (.json extension)
 # and the intermediate files are kept so each step can be inspected.
 #
 # xmllint (provided by libxml2 in the shebang above) detects the root element.

 # Stop at the first failing step instead of announcing a bogus conversion.
 set -e

 debug=
 files=()
 while [ "$#" -gt 0 ]; do
    i="$1"; shift 1
    case "$i" in
      --debug)
        debug=1
        ;;
      *)
        files+=("$i")
        ;;
    esac
 done
 echo "WARNING: This is an experimental script and might not preserve all formatting." >&2
 echo "Please report any issues you discover." >&2
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # NOTE: Keep in sync with Nixpkgs manual (/doc/Makefile).
 # TODO: Remove raw-attribute when we can get rid of DocBook altogether.
 pandoc_commonmark_enabled_extensions=+attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
 # In debug mode we emit pandoc's JSON instead of Markdown.
 if [[ $debug ]]; then
    outExtension="json"
    targetLang=json
 else
    outExtension="md"
    targetLang="commonmark${pandoc_commonmark_enabled_extensions}+smart"
 fi
 pandoc_flags=(
    # Not needed:
    # - diagram-generator.lua (we do not support that in NixOS manual to limit dependencies)
    # - media extraction (was only required for diagram generator)
    # - myst-reader/roles.lua (only relevant for MyST → DocBook)
    # - link-unix-man-references.lua (links should only be added to display output)
    # - docbook-writer/rst-roles.lua (only relevant for → DocBook)
    # - docbook-writer/labelless-link-is-xref.lua (only relevant for → DocBook)
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/docbook-reader/citerefentry-to-rst-role.lua"
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/myst-writer/roles.lua"
    "--lua-filter=$DIR/doc/unknown-code-language.lua"
    -f docbook
    -t "$targetLang"
    --tab-stop=2
    --wrap=none
 )
 for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
        echo "db-to-md.sh: $file does not exist" >&2
        exit 1
    fi

    # The name of the first element in the document, i.e. the root element,
    # decides the infix of the output file name.
    rootElement=$(xmllint --xpath 'name(//*)' "$file")
    case "$rootElement" in
      chapter|section)
        extension=".$rootElement.$outExtension"
        ;;
      *)
        echo "db-to-md.sh: $file contains an unsupported root element $rootElement" >&2
        exit 1
        ;;
    esac

    # Strip an existing .section.xml/.chapter.xml/.xml suffix before
    # appending the new extension.
    outFile="${file%".section.xml"}"
    outFile="${outFile%".chapter.xml"}"
    outFile="${outFile%".xml"}$extension"

    # Step 1: escape markup nested in code elements.
    temp1=$(mktemp)
    "$DIR/doc/escape-code-markup.py" "$file" "$temp1"
    if [[ $debug ]]; then
        echo "Converted $file to $temp1" >&2
    fi

    # Step 2: turn xrefs into links with empty labels.
    temp2=$(mktemp)
    "$DIR/doc/replace-xrefs-by-empty-links.py" "$temp1" "$temp2"
    if [[ $debug ]]; then
        echo "Converted $temp1 to $temp2" >&2
    fi

    # Step 3: let pandoc do the actual format conversion.
    pandoc "$temp2" -o "$outFile" "${pandoc_flags[@]}"

    # The debug messages above point at the intermediate files, so only
    # clean them up during a regular run.
    if [[ ! $debug ]]; then
        rm -f "$temp1" "$temp2"
    fi
    echo "Converted $file to $outFile" >&2
 done
--- a/maintainers/scripts/doc/escape-code-markup.py
+++ b/maintainers/scripts/doc/escape-code-markup.py
@ -0,0 +1,97 @@
 #! /usr/bin/env nix-shell
 #! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
 """
 Pandoc will strip any markup within code elements so
 let’s escape them so that they can be handled manually.
 """
 import lxml.etree as ET
 import re
 import sys
 def replace_element_by_text(el: ET.Element, text: str) -> None:
    """
    Remove ``el`` from its parent and put ``text`` in its place.

    The tail of ``el`` is kept by appending it to ``text``. The combined
    string is attached to the tail of the preceding sibling or, when ``el``
    is the first child, to the text of the parent. An element without
    a parent is left untouched.

    Author: bernulf
    Source: https://stackoverflow.com/a/10520552/160386
    SPDX-License-Identifier: CC-BY-SA-3.0
    """
    replacement = text + (el.tail or "")
    container = el.getparent()
    if container is None:
        return
    sibling = el.getprevious()
    if sibling is None:
        # No preceding element: the text lives directly in the parent.
        container.text = (container.text or "") + replacement
    else:
        sibling.tail = (sibling.tail or "") + replacement
    container.remove(el)
 # Namespace URI an element must have to be matched by the XPath query
 # built in the main block below.
 DOCBOOK_NS = "http://docbook.org/ns/docbook"
 # List of elements that pandoc’s DocBook reader strips markup from.
 # https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
 # Matching is done on the local name; child elements of these elements
 # are turned back into plain text by the main block below.
 code_elements = [
    # CodeBlock
    "literallayout",
    "screen",
    "programlisting",
    # Code (inline)
    "classname",
    "code",
    "filename",
    "envar",
    "literal",
    "computeroutput",
    "prompt",
    "parameter",
    "option",
    "markup",
    "wordasword",
    "command",
    "varname",
    "function",
    "type",
    "symbol",
    "constant",
    "userinput",
    "systemitem",
 ]
 # One double-quoted namespace declaration (default or prefixed), together
 # with the whitespace in front of it.
 XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
 # The first tag of a string, optionally preceded by whitespace.
 ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')
 def remove_xmlns(match: re.Match) -> str:
    """
    Return the matched text with all xmlns attributes dropped.
    Intended as a substitution callback whose match spans an opening tag.
    """
    opening_tag = match.group(0)
    return XMLNS_REGEX.sub('', opening_tag)
 if __name__ == '__main__':
    # Reads the XML file given as first argument, replaces every child
    # element of a DocBook code element by its serialized markup (as text)
    # and writes the result to the file given as second argument.
    #
    # Check the arguments explicitly: an assert would be stripped when
    # running with `python -O`, turning a usage error into an IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: escape-code-markup.py <input> <output>")
    tree = ET.parse(sys.argv[1])
    name_predicate = " or ".join([f"local-name()='{el}'" for el in code_elements])
    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
        text = ET.tostring(markup, encoding=str)
        # tostring adds xmlns attributes to the element we want to stringify
        # as if it was supposed to be usable standalone.
        # We are just converting it to CDATA so we do not care.
        # Let’s strip the namespace declarations to keep the code clean.
        #
        # Note that this removes even namespaces that were potentially
        # in the original file. Though, that should be very rare –
        # most of the time, we will stringify empty DocBook elements
        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
        #
        # Also note that the regex expects the root element to be first
        # thing in the string. But that should be fine, the tostring method
        # does not produce XML declaration or doctype by default.
        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)
        replace_element_by_text(markup, text)
    tree.write(sys.argv[2])
--- a/maintainers/scripts/doc/replace-xrefs-by-empty-links.py
+++ b/maintainers/scripts/doc/replace-xrefs-by-empty-links.py
@ -0,0 +1,32 @@
 #! /usr/bin/env nix-shell
 #! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
 """
 Pandoc will try to resolve xrefs and replace them with regular links.
 Let’s replace them with links with empty labels which MyST
 and our pandoc filters recognize as cross-references.
 """
 import lxml.etree as ET
 import sys
 # Namespace URI used for the href attribute set on the generated links.
 XLINK_NS = "http://www.w3.org/1999/xlink"
 # Prefix → namespace URI mapping passed to findall() so that the
 # "db:" prefix can be used in the search path.
 ns = {
    "db": "http://docbook.org/ns/docbook",
 }
 if __name__ == '__main__':
    # Reads the XML file given as first argument, replaces every DocBook
    # <xref linkend="ID"/> by an empty <link xlink:href="#ID"/> and writes
    # the result to the file given as second argument.
    #
    # Check the arguments explicitly: an assert would be stripped when
    # running with `python -O`, turning a usage error into an IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: replace-xrefs-by-empty-links.py <input> <output>")
    tree = ET.parse(sys.argv[1])
    # findall() hands back a list, so replacing elements while looping
    # over it is safe.
    for xref in tree.findall(".//db:xref", ns):
        parent = xref.getparent()
        # The replacement is created with the bare tag name 'link'.
        link = parent.makeelement('link')
        target_name = xref.get("linkend")
        link.set(f"{{{XLINK_NS}}}href", f"#{target_name}")
        # In lxml the text following an element is that element's tail and
        # replace() takes it away together with the old element. Carry it
        # over so the prose after the cross-reference is not lost.
        link.tail = xref.tail
        parent.replace(xref, link)
    tree.write(sys.argv[2])
--- a/maintainers/scripts/doc/unknown-code-language.lua
+++ b/maintainers/scripts/doc/unknown-code-language.lua
@ -0,0 +1,12 @@
 --[[
 Tags every CodeBlock AST node that has no classes with the “unknown” class.
 Pandoc will then emit it as a fenced code block, which we prefer.
 Nodes that already carry a class are left as they are.
 ]]
 function CodeBlock(elem)
  local classes = elem.classes
  if #classes ~= 0 then
    -- Nothing to do; returning no value keeps the node unchanged.
    return
  end
  classes:insert('unknown')
  return elem
 end