Revert "refactor: use lxml instead of string templating in process_file. (#5146)"
All checks were successful
continuous-integration/drone/pr Build is passing

This reverts commit c4b7f0f33c.
This commit is contained in:
2025-07-28 18:00:47 +02:00
parent c4b7f0f33c
commit 63e70f1eb1
14 changed files with 123 additions and 125 deletions

View File

@@ -14,61 +14,76 @@ from build.lib.misc import get_basename, get_version, lang_from_filename
logger = logging.getLogger(__name__)
def _get_xmls(file: Path, parser: etree.XMLParser) -> etree.Element:
def _include_xml(file: Path) -> str:
"""
include second level elements of a given XML file
this emulates the behaviour of the original
build script which wasn't able to load top
level elements from any file
"""
elements = []
work_str = ""
if file.exists():
tree = etree.parse(file, parser)
tree = etree.parse(file)
root = tree.getroot()
# Remove <version> because the filename attribute would otherwise be added
# to this element instead of the actual content element.
for elem in root.xpath("version"):
root.remove(elem)
# Iterate over all elements in root node, add a filename attribute and
# then append the string to work_str
for elem in root.xpath("*"):
elem.set("filename", get_basename(file))
elements.append(elem)
# and then we return the element
return elements
work_str += etree.tostring(elem, encoding="utf-8").decode("utf-8")
return work_str
def _get_attributes(file: Path) -> dict:
def _get_attributes(file: Path) -> str:
"""
get attributes of top level element in a given
XHTML file
"""
work_str = ""
tree = etree.parse(file)
root = tree.getroot()
attributes = root.items()
return dict(attributes)
attributes = root.attrib
for attrib in attributes:
work_str += f'{attrib}="{attributes[attrib]}"\n'
return work_str
def _get_trlist(file: Path) -> etree.Element:
def _list_langs(file: Path) -> str:
"""
list all languages a file exists in by globbing up
the shortname (i.e. file path with file ending omitted)
output is readily formatted for inclusion
in xml stream
"""
trlist = etree.Element("trlist")
for path in file.parent.glob(f"{get_basename(file)}.??{file.suffix}"):
tr = etree.SubElement(trlist, "tr", id=lang_from_filename(path))
tr.text = (
Path(f"global/languages/{lang_from_filename(path)}").read_text().strip()
return "\n".join(
list(
map(
lambda path: (
f'<tr id="{lang_from_filename(path)}">'
+ (
Path(f"global/languages/{lang_from_filename(path)}")
.read_text()
.strip()
)
+ "</tr>"
),
file.parent.glob(f"{get_basename(file)}.??{file.suffix}"),
)
)
return trlist
)
def _get_set(action_file: Path, lang: str, parser: etree.XMLParser) -> etree.Element:
def _auto_sources(action_file: Path, lang: str) -> str:
"""
import elements from source files, add file name
attribute to first element included from each file
"""
doc_set = etree.Element("set")
work_str = ""
list_file = action_file.with_stem(
f".{action_file.with_suffix('').stem}"
).with_suffix(".xmllist")
@@ -81,28 +96,21 @@ def _get_set(action_file: Path, lang: str, parser: etree.XMLParser) -> etree.Ele
if path.with_suffix(f".{lang}.xml").exists()
else path.with_suffix(".en.xml")
)
doc_set.extend(_get_xmls(path_xml, parser))
work_str += _include_xml(path_xml)
return doc_set
return work_str
def _get_document(
action_lang: str, action_file: Path, lang: str, parser: etree.XMLParser
) -> etree.Element:
document = etree.Element(
"document", language=action_lang, **_get_attributes(action_file)
)
document.append(_get_set(action_file, lang, parser))
document.extend(_get_xmls(action_file, parser))
return document
def _build_xmlstream(infile: Path, parser: etree.XMLParser) -> etree.Element:
def _build_xmlstream(infile: Path):
"""
assemble the xml stream for feeding into xsltproc
the expected shortname and language flag indicate
a single xhtml page to be built
"""
# TODO
# Ideally this would use lxml to construct an object instead of string templating.
# Should be a little faster, and also guarantees that it's valid XML
logger.debug(f"infile: {infile}")
shortname = infile.with_suffix("")
lang = lang_from_filename(infile)
@@ -126,6 +134,7 @@ def _build_xmlstream(infile: Path, parser: etree.XMLParser) -> etree.Element:
topbanner_xml = Path(f"global/data/topbanner/.topbanner.{lang}.xml")
texts_xml = Path(f"global/data/texts/.texts.{lang}.xml")
date = str(datetime.now().date())
# time = str(datetime.now().time())
action_lang = ""
translation_state = ""
@@ -150,29 +159,41 @@ def _build_xmlstream(infile: Path, parser: etree.XMLParser) -> etree.Element:
action_file = shortname.with_suffix(f".{action_lang}{infile.suffix}")
logger.debug(f"action_file: {action_file}")
# Create the root element
page = etree.Element(
"buildinfo",
date=date,
original=original_lang,
filename=f"/{str(shortname.with_suffix('')).removeprefix('/')}",
fileurl=f"/{shortname.relative_to(shortname.parts[0]).with_suffix('')}",
dirname=f"/{shortname.parent}/",
language=lang,
translation_state=translation_state,
)
# Add the subelements
page.append(_get_trlist(infile))
page.extend(_get_xmls(topbanner_xml, parser))
page.extend(_get_xmls(Path("global/data/texts/texts.en.xml"), parser))
page.extend(_get_xmls(texts_xml, parser))
page.append(_get_document(action_lang, action_file, lang, parser))
return page
result_str = f"""
<buildinfo
date="{date}"
original="{original_lang}"
filename="/{str(shortname.with_suffix("")).removeprefix("/")}"
fileurl="/{shortname.relative_to(shortname.parts[0]).with_suffix("")}"
dirname="/{shortname.parent}/"
language="{lang}"
translation_state="{translation_state}"
>
<trlist>
{_list_langs(infile)}
</trlist>
<topbanner>
{_include_xml(topbanner_xml)}
</topbanner>
<textsetbackup>
{_include_xml(Path("global/data/texts/texts.en.xml"))}
</textsetbackup>
<textset>
{_include_xml(texts_xml)}
</textset>
<document
language="{action_lang}"
{_get_attributes(action_file)}
>
<set>
{_auto_sources(action_file, lang)}
</set>
{_include_xml(action_file)}
</document>
</buildinfo>
"""
return result_str
def process_file(infile: Path, processor: Path) -> str:
@@ -181,54 +202,41 @@ def process_file(infile: Path, processor: Path) -> str:
"""
logger.debug(f"Processing {infile}")
lang = lang_from_filename(infile)
parser = etree.XMLParser(remove_blank_text=True, remove_comments=True)
xmlstream = _build_xmlstream(infile, parser)
xslt_tree = etree.parse(processor.resolve(), parser)
xmlstream = _build_xmlstream(infile)
xslt_tree = etree.parse(processor.resolve())
transform = etree.XSLT(xslt_tree)
result = transform(xmlstream)
result = str(transform(etree.XML(xmlstream)))
# And now a bunch of regexes to fix some links.
# xx is the language code in all comments
try:
for linkelem in result.xpath("//*[@href]"):
# remove any spurious whitespace
linkelem.set(
"href",
linkelem.get("href").strip(),
)
# Remove https://fsfe.org (or https://test.fsfe.org)
# from the start of all links
linkelem.set(
"href",
re.sub(
r"""^(https?://(test\.)?fsfe\.org)""",
"",
linkelem.get("href"),
flags=re.IGNORECASE,
),
)
# Change links from /foo/bar.html into /foo/bar.xx.html
# Change links from foo/bar.html into foo/bar.xx.html
# Same for .rss and .ics links
linkelem.set(
"href",
re.sub(
r"""^(/?([^:>]+/)?[^:/.]{3,}\.)(html|rss|ics)""",
rf"""\1{lang}.\3""",
linkelem.get("href"),
flags=re.IGNORECASE,
),
)
# Change links from /foo/bar/ into /foo/bar/index.xx.html
# Change links from foo/bar/ into foo/bar/index.xx.html
linkelem.set(
"href",
re.sub(
r"""^(/?[^:>]+/)$""",
rf"""\1index.{lang}.html""",
linkelem.get("href"),
flags=re.IGNORECASE,
),
)
except AssertionError:
logger.debug(f"Output generated for file {infile} is not valid xml")
# TODO
# Probably a faster way to do this
# Maybe iterating through all <a> tags with lxml?
# Once _build_xmlstream generates an XML object, that should be faster.
# Remove https://fsfe.org (or https://test.fsfe.org) from the start of all links
result = re.sub(
r"""href\s*=\s*("|')(https?://(test\.)?fsfe\.org)([^>])\1""",
r"""href=\1\3\1""",
result,
flags=re.MULTILINE | re.IGNORECASE,
)
# Change links from /foo/bar.html into /foo/bar.xx.html
# Change links from foo/bar.html into foo/bar.xx.html
# Same for .rss and .ics links
result = re.sub(
r"""href\s*=\s*("|')(/?([^:>]+/)?[^:/.]+\.)(html|rss|ics)(#[^>]*)?\1""",
rf"""href=\1\2{lang}.\4\5\1""",
result,
flags=re.MULTILINE | re.IGNORECASE,
)
# Change links from /foo/bar/ into /foo/bar/index.xx.html
# Change links from foo/bar/ into foo/bar/index.xx.html
result = re.sub(
r"""href\s*=\s*("|')(/?[^:>]+/)\1""",
rf"""href=\1\2index.{lang}.html\1""",
result,
flags=re.MULTILINE | re.IGNORECASE,
)
return result

View File

@@ -16,5 +16,5 @@ def full() -> None:
"""
logger.info("Performing a full rebuild, git cleaning")
run_command(
["git", "clean", "-fdx", "--exclude", "/.venv", "--exclude", "/.nltk_data"],
["git", "clean", "-fdx", "--exclude", "/.venv"],
)

View File

@@ -45,7 +45,7 @@ def _run_process(
logger.debug(f"Building {target_file}")
result = process_file(source_file, processor)
target_file.parent.mkdir(parents=True, exist_ok=True)
result.write_output(target_file)
target_file.write_text(result)
def _process_dir(

View File

@@ -8,7 +8,7 @@
<!-- ====================================================================== -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="text" encoding="utf-8"/>
<xsl:output method="text" encoding="UTF-8"/>
<xsl:template match="version">
<xsl:value-of select="."/>

View File

@@ -3,7 +3,7 @@
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:import href="xslt/drm_info_head.xsl" />
<xsl:import href="xslt/drm_info_body.xsl" />
<xsl:output method="html" omit-xml-declaration="yes" encoding="utf-8" doctype-system="about:legacy-compat" />
<xsl:output method="html" omit-xml-declaration="yes" encoding="utf-8" indent="yes" doctype-system="about:legacy-compat" />
<xsl:include href="../build/xslt/fsfe_document.xsl" />
<xsl:include href="../build/xslt/fsfe_nolocal.xsl" />
</xsl:stylesheet>

View File

@@ -5,7 +5,7 @@
xmlns:str="http://exslt.org/strings"
extension-element-prefixes="str">
<xsl:output method="text" encoding="utf-8" />
<xsl:output method="text" encoding="UTF-8" indent="no" />
<xsl:strip-space elements="body"/>
<!-- new line template -->

View File

@@ -4,7 +4,8 @@
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="utf-8" />
<xsl:output method="xml" encoding="UTF-8" omit-xml-declaration="yes"
indent="yes" />
<!-- ============= -->
<!-- Link handling -->

View File

@@ -22,7 +22,7 @@
<xsl:include href="../build/xslt/peertube.xsl" />
<!-- HTML 5 compatibility doctype, since our XSLT parser doesn't support disabling output escaping -->
<xsl:output method="html" encoding="utf-8" doctype-system="about:legacy-compat" />
<xsl:output method="html" encoding="utf-8" indent="yes" doctype-system="about:legacy-compat" />
<!-- EXTRACT / DESCRIPTION of each page -->
<xsl:variable name="metadesc">

View File

@@ -10,7 +10,7 @@
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:atom="http://www.w3.org/2005/Atom">
<xsl:output method="xml" encoding="utf-8"/>
<xsl:output method="xml" encoding="utf-8" indent="yes"/>
<!-- ====== -->
<!-- Months -->

View File

@@ -3,7 +3,7 @@
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:import href="podcast.rss.xsl" />
<xsl:output method="xml" encoding="utf-8" />
<xsl:output method="xml" encoding="utf-8" indent="yes" />
<xsl:template match="/">
<xsl:apply-templates select="/buildinfo/document">

View File

@@ -15,7 +15,7 @@
<xsl:import href="../../build/xslt/gettext.xsl" />
<xsl:output method="xml" encoding="utf-8"/>
<xsl:output method="xml" encoding="utf-8" indent="yes"/>
<!-- ====== -->
<!-- Months -->

View File

@@ -2,7 +2,7 @@
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="utf-8" />
<xsl:output method="xml" encoding="UTF-8" indent="yes"/>
<xsl:param name="link"/>

View File

@@ -3,7 +3,7 @@
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:import href="xslt/pdfreaders_head.xsl" />
<xsl:import href="xslt/pdfreaders_body.xsl" />
<xsl:output method="html" omit-xml-declaration="yes" encoding="utf-8" doctype-system="about:legacy-compat" />
<xsl:output method="html" omit-xml-declaration="yes" encoding="utf-8" indent="yes" doctype-system="about:legacy-compat" />
<xsl:include href="../build/xslt/fsfe_document.xsl" />
<xsl:include href="xslt/pdfreaders_list.xsl" />

View File

@@ -5,7 +5,6 @@
},
}:
let
inherit (pkgs) lib;
treefmt-nixSrc = builtins.fetchTarball "https://github.com/numtide/treefmt-nix/archive/refs/heads/master.tar.gz";
treefmt-nix = import treefmt-nixSrc;
in
@@ -17,8 +16,6 @@ in
(with pkgs; [
# For getting python deps
uv
# Need to use a nix python to prevent ssl certs issues
python312
# needed by lxml
libxslt
libxml2
@@ -59,13 +56,5 @@ in
pkgs:
(with pkgs; [
]);
runScript = pkgs.writeShellScript "fsfe-website-env" ''
set -euo pipefail
# Force uv to use Python interpreter from venv
export UV_PYTHON="${lib.getExe pkgs.python312}";
# Prevent uv from downloading managed Python versions
export UV_PYTHON_DOWNLOADS="never"
uv venv
bash --rcfile .venv/bin/activate "$@"
'';
# runScript = '''';
}).env