mirror of
https://github.com/NixOS/nixpkgs.git
synced 2024-11-26 08:53:21 +00:00
doc: Add helper for converting DocBook files to Markdown
This commit is contained in:
parent
04b59b0328
commit
da95ab11b4
@ -3,7 +3,7 @@ MD_TARGETS=$(addsuffix .xml, $(basename $(shell find . -type f -regex '.*\.md$$'
|
|||||||
PANDOC ?= pandoc
|
PANDOC ?= pandoc
|
||||||
|
|
||||||
pandoc_media_dir = media
|
pandoc_media_dir = media
|
||||||
# NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh).
|
# NOTE: Keep in sync with NixOS manual (/nixos/doc/manual/md-to-db.sh) and conversion script (/maintainers/scripts/db-to-md.sh).
|
||||||
# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
|
# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
|
||||||
pandoc_commonmark_enabled_extensions = +attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
|
pandoc_commonmark_enabled_extensions = +attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
|
||||||
# Not needed:
|
# Not needed:
|
||||||
|
88
maintainers/scripts/db-to-md.sh
Executable file
88
maintainers/scripts/db-to-md.sh
Executable file
@ -0,0 +1,88 @@
|
|||||||
|
#! /usr/bin/env nix-shell
#! nix-shell -I nixpkgs=. -i bash -p pandoc -p libxml2

# This script is temporarily needed while we transition the manual to
# CommonMark. It converts DocBook files into our CommonMark flavour.
#
# Usage: db-to-md.sh [--debug] FILE...
#
# Every FILE is converted into a sibling file whose name ends in
# .chapter.md or .section.md, depending on the root element of FILE.
# With --debug, pandoc’s JSON AST is written instead (.json extension),
# and the intermediate files are announced and kept for inspection.
#
# libxml2 is requested above because the script calls xmllint.

debug=
files=()

while [ "$#" -gt 0 ]; do
    i="$1"; shift 1
    case "$i" in
        --debug)
            debug=1
            ;;
        *)
            files+=("$i")
            ;;
    esac
done

# Diagnostics go to file descriptor 2 directly. Redirecting to /dev/stderr
# would re-open the file and truncate a log that stderr is redirected into.
echo "WARNING: This is an experimental script and might not preserve all formatting." >&2
echo "Please report any issues you discover." >&2

outExtension="md"
if [[ $debug ]]; then
    outExtension="json"
fi

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Intermediate files created so far. They are removed on exit, unless
# --debug was given, in which case their paths have been printed.
tempFiles=()
cleanup() {
    if [[ ! $debug && ${#tempFiles[@]} -gt 0 ]]; then
        rm -f -- "${tempFiles[@]}"
    fi
}
trap cleanup EXIT

# NOTE: Keep in sync with Nixpkgs manual (/doc/Makefile).
# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
pandoc_commonmark_enabled_extensions=+attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
targetLang="commonmark${pandoc_commonmark_enabled_extensions}+smart"
if [[ $debug ]]; then
    targetLang=json
fi
pandoc_flags=(
    # Not needed:
    # - diagram-generator.lua (we do not support that in NixOS manual to limit dependencies)
    # - media extraction (was only required for diagram generator)
    # - myst-reader/roles.lua (only relevant for MyST → DocBook)
    # - link-unix-man-references.lua (links should only be added to display output)
    # - docbook-writer/rst-roles.lua (only relevant for → DocBook)
    # - docbook-writer/labelless-link-is-xref.lua (only relevant for → DocBook)
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/docbook-reader/citerefentry-to-rst-role.lua"
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/myst-writer/roles.lua"
    "--lua-filter=$DIR/doc/unknown-code-language.lua"
    -f docbook
    -t "$targetLang"
    --tab-stop=2
    --wrap=none
)

for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
        echo "db-to-md.sh: $file does not exist" >&2
        exit 1
    fi

    # The root element decides which suffix the output file gets.
    rootElement=$(xmllint --xpath 'name(//*)' "$file")

    if [[ $rootElement = chapter ]]; then
        extension=".chapter.$outExtension"
    elif [[ $rootElement = section ]]; then
        extension=".section.$outExtension"
    else
        echo "db-to-md.sh: $file contains an unsupported root element $rootElement" >&2
        exit 1
    fi

    # Drop a pre-existing .section.xml/.chapter.xml/.xml suffix before
    # appending the new one.
    outFile="${file%".section.xml"}"
    outFile="${outFile%".chapter.xml"}"
    outFile="${outFile%".xml"}$extension"

    # Step 1: escape markup nested in code elements.
    temp1=$(mktemp) || exit 1
    tempFiles+=("$temp1")
    "$DIR/doc/escape-code-markup.py" "$file" "$temp1" || exit 1
    if [[ $debug ]]; then
        echo "Converted $file to $temp1" >&2
    fi

    # Step 2: turn xrefs into links with empty labels.
    temp2=$(mktemp) || exit 1
    tempFiles+=("$temp2")
    "$DIR/doc/replace-xrefs-by-empty-links.py" "$temp1" "$temp2" || exit 1
    if [[ $debug ]]; then
        echo "Converted $temp1 to $temp2" >&2
    fi

    # Step 3: let pandoc produce the final output. Stop on failure so that
    # a broken conversion is never reported as successful.
    pandoc "$temp2" -o "$outFile" "${pandoc_flags[@]}" || exit 1
    echo "Converted $file to $outFile" >&2
done
|
97
maintainers/scripts/doc/escape-code-markup.py
Executable file
97
maintainers/scripts/doc/escape-code-markup.py
Executable file
@ -0,0 +1,97 @@
|
|||||||
|
#! /usr/bin/env nix-shell
|
||||||
|
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
|
||||||
|
|
||||||
|
"""
|
||||||
|
Pandoc will strip any markup within code elements so
|
||||||
|
let’s escape them so that they can be handled manually.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import lxml.etree as ET
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def replace_element_by_text(el: ET.Element, text: str) -> None:
    """
    Swap ``el`` for the plain string ``text`` inside its parent.

    The element's tail is appended to ``text`` so that whatever followed
    the element stays in place. The combined string is attached to the
    tail of the preceding sibling, or to the parent's text when ``el`` is
    the first child. Elements without a parent are left untouched.

    Author: bernulf
    Source: https://stackoverflow.com/a/10520552/160386
    SPDX-License-Identifier: CC-BY-SA-3.0
    """
    replacement = text + (el.tail or "")
    container = el.getparent()
    if container is None:
        return

    sibling = el.getprevious()
    if sibling is None:
        container.text = (container.text or "") + replacement
    else:
        sibling.tail = (sibling.tail or "") + replacement
    container.remove(el)
|
||||||
|
|
||||||
|
# XML namespace URI used to recognise DocBook elements.
DOCBOOK_NS = "http://docbook.org/ns/docbook"

# List of elements that pandoc’s DocBook reader strips markup from.
# https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
# The entries are local element names (no namespace prefix).
code_elements = [
    # CodeBlock
    "literallayout",
    "screen",
    "programlisting",
    # Code (inline)
    "classname",
    "code",
    "filename",
    "envar",
    "literal",
    "computeroutput",
    "prompt",
    "parameter",
    "option",
    "markup",
    "wordasword",
    "command",
    "varname",
    "function",
    "type",
    "symbol",
    "constant",
    "userinput",
    "systemitem",
]
|
||||||
|
|
||||||
|
# One namespace declaration attribute (``xmlns="…"`` or ``xmlns:prefix="…"``)
# together with the whitespace in front of it.
XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
# The first tag of a string, optionally preceded by whitespace.
ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')


def remove_xmlns(match: re.Match) -> str:
    """
    Return the matched text with all xmlns attributes stripped.

    Intended as a replacement callback for ``ROOT_ELEMENT_REGEX.sub``,
    so ``match`` is expected to span an opening tag.
    """
    opening_tag = match.group(0)
    return XMLNS_REGEX.sub('', opening_tag)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Validate the command line explicitly: an ``assert`` would be
    # stripped when Python runs with -O.
    if len(sys.argv) < 3:
        sys.exit("usage: escape-code-markup.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    name_predicate = " or ".join(f"local-name()='{el}'" for el in code_elements)

    # Visit every element that is a direct child of a DocBook code element.
    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
        # Serialise the element without its tail. replace_element_by_text
        # appends the tail itself, so including it here as well would
        # duplicate the text that follows the element.
        text = ET.tostring(markup, encoding=str, with_tail=False)

        # tostring adds xmlns attributes to the element we want to stringify
        # as if it was supposed to be usable standalone.
        # We are just converting it to CDATA so we do not care.
        # Let’s strip the namespace declarations to keep the code clean.
        #
        # Note that this removes even namespaces that were potentially
        # in the original file. Though, that should be very rare –
        # most of the time, we will stringify empty DocBook elements
        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
        #
        # Also note that the regex expects the root element to be first
        # thing in the string. But that should be fine, the tostring method
        # does not produce XML declaration or doctype by default.
        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)

        replace_element_by_text(markup, text)

    tree.write(sys.argv[2])
|
32
maintainers/scripts/doc/replace-xrefs-by-empty-links.py
Executable file
32
maintainers/scripts/doc/replace-xrefs-by-empty-links.py
Executable file
@ -0,0 +1,32 @@
|
|||||||
|
#! /usr/bin/env nix-shell
|
||||||
|
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
|
||||||
|
|
||||||
|
"""
|
||||||
|
Pandoc will try to resolve xrefs and replace them with regular links.
|
||||||
|
Let’s replace them with links with empty labels which MyST
|
||||||
|
and our pandoc filters recognize as cross-references.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import lxml.etree as ET
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Namespace of the xlink:href attribute set on the generated links.
XLINK_NS = "http://www.w3.org/1999/xlink"

# Prefix map for the element lookup below.
ns = {
    "db": "http://docbook.org/ns/docbook",
}


if __name__ == '__main__':
    # Validate the command line explicitly: an ``assert`` would be
    # stripped when Python runs with -O.
    if len(sys.argv) < 3:
        sys.exit("usage: replace-xrefs-by-empty-links.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    for xref in tree.findall(".//db:xref", ns):
        parent = xref.getparent()
        # Build an empty-label link pointing at the xref's target.
        link = parent.makeelement('link')
        target_name = xref.get("linkend")
        link.set(f"{{{XLINK_NS}}}href", f"#{target_name}")
        # lxml stores the text following an element in its ``tail`` and
        # moves it away together with the replaced element. Copy it over,
        # otherwise everything after the xref would vanish from the output.
        link.tail = xref.tail
        parent.replace(xref, link)

    tree.write(sys.argv[2])
|
12
maintainers/scripts/doc/unknown-code-language.lua
Normal file
12
maintainers/scripts/doc/unknown-code-language.lua
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
--[[
|
||||||
|
Adds “unknown” class to CodeBlock AST nodes without any classes.
|
||||||
|
|
||||||
|
This will cause Pandoc to use fenced code block, which we prefer.
|
||||||
|
]]
|
||||||
|
|
||||||
|
-- Tags class-less code blocks with the “unknown” class.
-- Blocks that already carry a class yield no return value.
function CodeBlock(elem)
  if #elem.classes ~= 0 then
    return
  end

  elem.classes:insert('unknown')
  return elem
end
|
Loading…
Reference in New Issue
Block a user