doc: Add helper for converting DocBook files to Markdown

2024-11-26 00:43:20 +00:00 · 2021-07-12 22:32:21 +02:00 · 2021-07-12 22:32:21 +02:00 · da95ab11b4
commit da95ab11b4
parent 04b59b0328
5 changed files with 230 additions and 1 deletions
--- a/doc/Makefile
+++ b/doc/Makefile
@ -3,7 +3,7 @@ MD_TARGETS=$(addsuffix .xml, $(basename $(shell find . -type f -regex '.*\.md$$'
 PANDOC ?= pandoc

 pandoc_media_dir = media
-# NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh).
+# NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh) and conversion script (/maintainers/scripts/db-to-md.sh).
 # TODO: Remove raw-attribute when we can get rid of DocBook altogether.
 pandoc_commonmark_enabled_extensions = +attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
 # Not needed:
--- a/maintainers/scripts/db-to-md.sh
+++ b/maintainers/scripts/db-to-md.sh
@ -0,0 +1,88 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -I nixpkgs=. -i bash -p pandoc
+
+# This script is temporarily needed while we transition the manual to
+# CommonMark. It converts DocBook files into our CommonMark flavour.
+
+debug=
+files=()
+
+while [ "$#" -gt 0 ]; do
+    i="$1"; shift 1
+    case "$i" in
+      --debug)
+        debug=1
+        ;;
+      *)
+        files+=("$i")
+        ;;
+    esac
+done
+
+echo "WARNING: This is an experimental script and might not preserve all formatting." > /dev/stderr
+echo "Please report any issues you discover." > /dev/stderr
+
+outExtension="md"
+if [[ $debug ]]; then
+    outExtension="json"
+fi
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# NOTE: Keep in sync with Nixpkgs manual (/doc/Makefile).
+# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
+pandoc_commonmark_enabled_extensions=+attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
+targetLang="commonmark${pandoc_commonmark_enabled_extensions}+smart"
+if [[ $debug ]]; then
+    targetLang=json
+fi
+pandoc_flags=(
+    # Not needed:
+    # - diagram-generator.lua (we do not support that in NixOS manual to limit dependencies)
+    # - media extraction (was only required for diagram generator)
+    # - myst-reader/roles.lua (only relevant for MyST → DocBook)
+    # - link-unix-man-references.lua (links should only be added to display output)
+    # - docbook-writer/rst-roles.lua (only relevant for → DocBook)
+    # - docbook-writer/labelless-link-is-xref.lua (only relevant for → DocBook)
+    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/docbook-reader/citerefentry-to-rst-role.lua"
+    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/myst-writer/roles.lua"
+    "--lua-filter=$DIR/doc/unknown-code-language.lua"
+    -f docbook
+    -t "$targetLang"
+    --tab-stop=2
+    --wrap=none
+)
+
+for file in "${files[@]}"; do
+    if [[ ! -f "$file" ]]; then
+        echo "db-to-md.sh: $file does not exist" > /dev/stderr
+        exit 1
+    else
+    rootElement=$(xmllint --xpath 'name(//*)' "$file")
+
+    if [[ $rootElement = chapter ]]; then
+        extension=".chapter.$outExtension"
+    elif [[ $rootElement = section ]]; then
+        extension=".section.$outExtension"
+    else
+        echo "db-to-md.sh: $file contains an unsupported root element $rootElement" > /dev/stderr
+        exit 1
+    fi
+
+    outFile="${file%".section.xml"}"
+    outFile="${outFile%".chapter.xml"}"
+    outFile="${outFile%".xml"}$extension"
+    temp1=$(mktemp)
+    $DIR/doc/escape-code-markup.py "$file" "$temp1"
+    if [[ $debug ]]; then
+        echo "Converted $file to $temp1" > /dev/stderr
+    fi
+    temp2=$(mktemp)
+    $DIR/doc/replace-xrefs-by-empty-links.py "$temp1" "$temp2"
+    if [[ $debug ]]; then
+        echo "Converted $temp1 to $temp2" > /dev/stderr
+    fi
+    pandoc "$temp2" -o "$outFile" "${pandoc_flags[@]}"
+    echo "Converted $file to $outFile" > /dev/stderr
+  fi
+done
--- a/maintainers/scripts/doc/escape-code-markup.py
+++ b/maintainers/scripts/doc/escape-code-markup.py
@ -0,0 +1,97 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
+
+"""
+Pandoc will strip any markup within code elements so
+let’s escape them so that they can be handled manually.
+"""
+
+import lxml.etree as ET
+import re
+import sys
+
+def replace_element_by_text(el: ET.Element, text: str) -> None:
+    """
+    Author: bernulf
+    Source: https://stackoverflow.com/a/10520552/160386
+    SPDX-License-Identifier: CC-BY-SA-3.0
+    """
+    text = text + (el.tail or "")
+    parent = el.getparent()
+    if parent is not None:
+        previous = el.getprevious()
+        if previous is not None:
+            previous.tail = (previous.tail or "") + text
+        else:
+            parent.text = (parent.text or "") + text
+        parent.remove(el)
+
+DOCBOOK_NS = "http://docbook.org/ns/docbook"  # namespace-uri() the code elements below are matched against
+
+# List of elements that pandoc’s DocBook reader strips markup from.
+# https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
+code_elements = [
+    # CodeBlock
+    "literallayout",
+    "screen",
+    "programlisting",
+    # Code (inline)
+    "classname",
+    "code",
+    "filename",
+    "envar",
+    "literal",
+    "computeroutput",
+    "prompt",
+    "parameter",
+    "option",
+    "markup",
+    "wordasword",
+    "command",
+    "varname",
+    "function",
+    "type",
+    "symbol",
+    "constant",
+    "userinput",
+    "systemitem",
+]
+
+XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')  # one double-quoted xmlns or xmlns:prefix attribute plus the whitespace before it
+ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')  # the first tag at the start of a string, leading whitespace allowed
+
+def remove_xmlns(match: re.Match) -> str:
+    """
+    Removes xmlns attributes.
+
+    Expects a match containing an opening tag.
+    """
+    return XMLNS_REGEX.sub('', match.group(0))
+
+if __name__ == '__main__':
+    assert len(sys.argv) >= 3, "usage: escape-code-markup.py <input> <output>"
+
+    tree = ET.parse(sys.argv[1])
+    name_predicate = " or ".join([f"local-name()='{el}'" for el in code_elements])
+
+    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
+        text = ET.tostring(markup, encoding=str)
+
+        # tostring adds xmlns attributes to the element we want to stringify
+        # as if it was supposed to be usable standalone.
+        # We are just converting it to CDATA so we do not care.
+        # Let’s strip the namespace declarations to keep the code clean.
+        #
+        # Note that this removes even namespaces that were potentially
+        # in the original file. Though, that should be very rare –
+        # most of the time, we will stringify empty DocBook elements
+        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
+        #
+        # Also note that the regex expects the root element to be first
+        # thing in the string. But that should be fine, the tostring method
+        # does not produce XML declaration or doctype by default.
+        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)
+
+        replace_element_by_text(markup, text)
+
+    tree.write(sys.argv[2])
--- a/maintainers/scripts/doc/replace-xrefs-by-empty-links.py
+++ b/maintainers/scripts/doc/replace-xrefs-by-empty-links.py
@ -0,0 +1,32 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
+
+"""
+Pandoc will try to resolve xrefs and replace them with regular links.
+Let’s replace them with links with empty labels which MyST
+and our pandoc filters recognize as cross-references.
+"""
+
+import lxml.etree as ET
+import sys
+
+XLINK_NS = "http://www.w3.org/1999/xlink"  # namespace of the href attribute set on the generated links
+
+ns = {  # prefix map for the findall() query that locates the xref elements
+    "db": "http://docbook.org/ns/docbook",
+}
+
+
+if __name__ == '__main__':
+    assert len(sys.argv) >= 3, "usage: replace-xrefs-by-empty-links.py <input> <output>"
+
+    tree = ET.parse(sys.argv[1])
+    for xref in tree.findall(".//db:xref", ns):
+        text = ET.tostring(xref, encoding=str)
+        parent = xref.getparent()
+        link = parent.makeelement('link')
+        target_name = xref.get("linkend")
+        link.set(f"{{{XLINK_NS}}}href", f"#{target_name}")
+        parent.replace(xref, link)
+
+    tree.write(sys.argv[2])
--- a/maintainers/scripts/doc/unknown-code-language.lua
+++ b/maintainers/scripts/doc/unknown-code-language.lua
@ -0,0 +1,12 @@
+--[[
+Adds “unknown” class to CodeBlock AST nodes without any classes.
+
+This will cause Pandoc to use fenced code block, which we prefer.
+]]
+
+function CodeBlock(elem)
+  -- Blocks that already carry a class are left untouched (nothing returned).
+  if #elem.classes > 0 then return end
+  elem.classes:insert('unknown')
+  return elem
+end